WGCNA

last updated: 2023-11-14

Install Packages

As usual, make sure we have the right packages for this exercise

if (!require("pacman")) install.packages("pacman"); library(pacman)
## Loading required package: pacman
# let's load all of the files we were using and want to have again today
p_load("tidyverse", "knitr", "readr",
       "pander", "BiocManager", 
       "dplyr", "stringr", 
       "statmod", # required dependency, need to load manually on some macOS versions.
       "Glimma", # beautifies limma results
       "purrr", # for working with lists (beautify column names)
       "reactable") # for pretty tables.

# for today:
# p_load("WGCNA")   # WGCNA is available on CRAN
if (!require("WGCNA")) BiocManager::install("WGCNA", force=TRUE); library(WGCNA)
## Loading required package: WGCNA
## Loading required package: dynamicTreeCut
## Loading required package: fastcluster
## 
## Attaching package: 'fastcluster'
## The following object is masked from 'package:stats':
## 
##     hclust
## 
## 
## Attaching package: 'WGCNA'
## The following object is masked from 'package:stats':
## 
##     cor
# 
# # We also need these Bioconductor packages today.
p_load("edgeR")
p_load("matrixStats")
p_load("org.Sc.sgd.db")
p_load("cowplot")
p_load("igraph")
# p_load("AnnotationDbi", "org.Sc.sgd.db", "ggVennDiagram")
# p_load("janitor")
# p_load("RColorBrewer")
# p_load("UpSetR")
# p_load("ComplexHeatmap")
# p_load("enrichplot")
# p_load("clusterProfiler")
# p_load("factoextra", "NbClust")
# p_load("ggpubr", "corrplot", "stringr", "tidyr")
# #NOTE: edgeR loads limma as a dependency
# library(ComplexHeatmap)
# library(RColorBrewer)
# library(limma)
# library(org.Sc.sgd.db)
# 
# # choose number clusters
# library(factoextra)
# library(NbClust)
# 
# # corr plots
# library(ggpubr)
# library(corrplot)
# library(stringr)
# library(tidyr)

Description

Use limma to identify differential patterns of proteomic and transcriptomic changes in a time series heat shock experiment.

Learning Objectives

At the end of this exercise, you should be able to:

  • Recognize the steps of DE analysis of Mass Spec data
  • Identify differentially Abundant proteins with limma
  • Better understand the temporal relationship between gene expression and protein abundance
# for ease of use, set max number of digits after decimal
options(digits=3)

Loading in the count data file

We are downloading the counts for the non-subsampled fastq files from a Github repository using the code below. Just as in previous exercises, assign the data to the variable counts. You can change the file path if you have saved it to your computer in a different location.

# proteome_HS <- read.delim("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/heat_shock/proteomic_FC/HS_Proteomics_Data.txt",
#                           sep="\t",
#                           header=T,
#                           row.names=1)

# Load gene file used of raw counts 
counts <- read.delim('https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/YPS606_TF_Rep1_4_ExpectedCounts.txt',
    sep = "\t",
    header = T,
    row.names = 1
  )
col_sel = names(counts)     # Get all but first column name
mdata <- counts %>%
  rownames_to_column("gene_id") %>%
  tidyr::pivot_longer(
    .,                        # The dot is the the input data, magrittr tutorial
    col = all_of(col_sel)
    ) %>%  
  separate_wider_delim(name, delim = "_", names = c("strain", "stress", "replicate"),cols_remove = FALSE)

(
p <- mdata %>%
    ggplot(., aes(x = replicate, y = value)) +             # x = treatment, y = RNA Seq count
    geom_violin() +                                   # violin plot, show distribution
    geom_point(alpha = 0.2) +                         # scatter plot
    theme_bw() +
    theme(
      axis.text.x = element_text(angle = 90)          # Rotate treatment text
    ) +
    labs(x = "Treatment Groups", y = "RNA Seq Counts") +
    facet_grid(cols = vars(stress),rows = vars(strain), drop = TRUE, scales = "free_x")      # Facet by hour
)

Let’s use edgeR to normalize.

# read into a DGElist
y <- edgeR::DGEList(counts,
                    group=as.factor(
                      # remove replicate from name
                      sub("_[^_]+$", "", colnames(counts))
                    ),
                    genes = rownames(counts)
                    )


# filter low counts
keep <- rowSums(cpm(y) > 1) >= 4
y <- y[keep,]

# normalize with cpm
normalized_counts <- cpm(y,
                         log = TRUE)
str(normalized_counts)
##  num [1:5756, 1:48] 4.59 5.07 10.95 0.18 8.03 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:5756] "YAL001C" "YAL002W" "YAL003W" "YAL004W" ...
##   ..$ : chr [1:48] "YPS606_Mock_Rep1" "YPS606_Mock_Rep2" "YPS606_Mock_Rep3" "YPS606_Mock_Rep4" ...
# get th row 
rv_wpn <- matrixStats::rowVars(normalized_counts)

summary(rv_wpn)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.01    0.15    0.31    0.79    0.74   20.77
# get variance corresponding to 95th percentile
q95_wpn <- quantile( matrixStats::rowVars(normalized_counts), .95)  # <= changed to 95 quantile 
# get variance corresponding to 97.5th percentile
q975_wpn <- quantile( matrixStats::rowVars(normalized_counts), .975) 

# to reduce dataset, only get genes with variance greater than above
expr_normalized <- normalized_counts[ rv_wpn > q975_wpn, ]

str(expr_normalized)
##  num [1:144, 1:48] 9.196 7.176 6.818 0.763 -0.927 ...
##  - attr(*, "dimnames")=List of 2
##   ..$ : chr [1:144] "YBL039C" "YBL042C" "YBL054W" "YBR072W" ...
##   ..$ : chr [1:48] "YPS606_Mock_Rep1" "YPS606_Mock_Rep2" "YPS606_Mock_Rep3" "YPS606_Mock_Rep4" ...

Notice the number of genes retained is much fewer now.

What are the distributions of counts of the normalized cpm’s of the retained genes?

expr_normalized_df <- data.frame(expr_normalized) %>%
  mutate(
    Gene_id = row.names(expr_normalized)
  ) %>%
  pivot_longer(-Gene_id)

expr_normalized_df %>% ggplot(., aes(x = name, y = value)) +
  geom_violin() +
  geom_point(alpha=0.5) +
  theme_bw() +
  theme(
    axis.text.x = element_text( angle = 90)
  ) +
  ylim(0, NA) +
  labs(
    title = "Normalized and 95 quantile Expression",
    x = "treatment",
    y = "normalized expression"
  )
## Warning: Removed 746 rows containing non-finite values (`stat_ydensity()`).
## Warning: Removed 746 rows containing missing values (`geom_point()`).

Now let’s transpose the data and prepare the dataset for WGCNA.

input_mat = t(expr_normalized)

# Look at first 5 rows and 10 columns
input_mat[1:5,1:10]           
##                  YBL039C YBL042C YBL054W YBR072W YBR116C YBR117C YBR126C
## YPS606_Mock_Rep1    9.20    7.18   6.818   0.763  -0.927   0.856    6.22
## YPS606_Mock_Rep2    9.35    7.04   7.120   0.412  -0.761   0.998    6.19
## YPS606_Mock_Rep3    9.11    7.16   6.537   0.801   0.234   1.064    6.20
## YPS606_Mock_Rep4    9.33    7.08   6.860   0.847  -1.379   1.198    6.30
## YPS606_EtOH_Rep1    1.80    1.55   0.401   9.044   0.569   3.295   11.66
##                  YBR145W YBR147W YBR169C
## YPS606_Mock_Rep1    2.92   -1.00    1.73
## YPS606_Mock_Rep2    3.14   -1.18    1.77
## YPS606_Mock_Rep3    2.71   -1.46    2.10
## YPS606_Mock_Rep4    2.84   -1.90    1.89
## YPS606_EtOH_Rep1   10.48    2.77    7.80

We can see now that the rows = treatments and columns = gene probes. We’re ready to start WGCNA. A correlation network will be a complete network (all genes are connected to all other genes). Ergo we will need to pick a threshhold value (if correlation is below threshold, remove the edge). We assume the true biological network follows a scale-free structure (see papers by Albert Barabasi).

To do that, WGCNA will try a range of soft thresholds and create a diagnostic plot. This step will take several minutes so feel free to run and get coffee.

# library(WGCNA)
allowWGCNAThreads(nThreads = 4)          # allow multi-threading (optional)
## Allowing multi-threading with up to 4 threads.
#> Allowing multi-threading with up to 4 threads.

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to = 20, by = 2))

# Call the network topology analysis function
sft = pickSoftThreshold(
  input_mat,             # <= Input data
  #blockSize = 30,
  powerVector = powers,
  verbose = 5
  )
## pickSoftThreshold: will use block size 144.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 144 of 144
##    Power SFT.R.sq   slope truncated.R.sq mean.k. median.k. max.k.
## 1      1   0.4920  1.7600       6.60e-01    86.9     95.30  106.0
## 2      2   0.5340  0.7260       7.30e-01    65.4     74.90   89.4
## 3      3   0.5560  0.5630       6.18e-01    52.9     61.40   77.7
## 4      4   0.3590  0.4430       3.52e-01    44.4     51.10   68.6
## 5      5   0.3540  0.3580       3.26e-01    38.1     43.70   61.2
## 6      6   0.3710  0.2900       3.77e-01    33.3     37.90   55.0
## 7      7   0.4270  0.2790       4.84e-01    29.5     33.20   49.8
## 8      8   0.2720  0.2380       2.61e-01    26.4     29.20   45.3
## 9      9   0.2450  0.1960       2.74e-01    23.8     26.10   41.5
## 10    10   0.1280  0.1400       1.04e-01    21.6     24.00   38.5
## 11    12   0.0253  0.0391       4.58e-05    18.1     19.70   34.9
## 12    14   0.0871 -0.0752      -1.59e-01    15.5     16.00   32.0
## 13    16   0.3840 -0.1700       2.23e-01    13.4     13.20   29.6
## 14    18   0.6510 -0.2600       6.30e-01    11.8     11.10   27.6
## 15    20   0.5960 -0.3890       5.70e-01    10.4      9.62   25.9
par(mfrow = c(1,2));
cex1 = 0.9;

plot(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2",
     main = paste("Scale independence")
)
text(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "red"
)
abline(h = 0.90, col = "red")
plot(sft$fitIndices[, 1],
     sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = paste("Mean connectivity")
)
text(sft$fitIndices[, 1],
     sft$fitIndices[, 5],
     labels = powers,
     cex = cex1, col = "red")

Pick a soft threshold power near the curve of the plot, so here we could pick 7, 8 or 9. We’ll pick 9 but feel free to experiment with other powers to see how it affects your results. Now we can create the network using the blockwiseModules command. The blockwiseModule may take a while to run, since it is constructing the TOM (topological overlap matrix) and several other steps. While it runs, take a look at the blockwiseModule documentation (link to vignette[https://www.rdocumentation.org/packages/WGCNA/versions/1.69/topics/blockwiseModules]) for more information on the parameters. How might you change the parameters to get more or less modules?

picked_power = 7
temp_cor <- cor       
cor <- WGCNA::cor         # Force it to use WGCNA cor function (fix a namespace conflict issue)
netwk <- blockwiseModules(input_mat,                # <= input here

                          # == Adjacency Function ==
                          power = picked_power,                # <= power here
                          networkType = "signed",

                          # == Tree and Block Options ==
                          deepSplit = 2,
                          pamRespectsDendro = F,
                          # detectCutHeight = 0.75,
                          minModuleSize = 30,
                          maxBlockSize = 4000,

                          # == Module Adjustments ==
                          reassignThreshold = 0,
                          mergeCutHeight = 0.25,

                          # == TOM == Archive the run results in TOM file (saves time)
                          saveTOMs = T,
                          saveTOMFileBase = "ER",

                          # == Output Options
                          numericLabels = T,
                          verbose = 3)
##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ..Working on block 1 .
##     TOM calculation: adjacency..
##     ..will use 4 parallel threads.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 1 into file ER-block.1.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 3 genes from module 2 because their KME is too low.
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.25
##        Calculating new MEs...
cor <- temp_cor
# Convert labels to colors for plotting
mergedColors = labels2colors(netwk$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
  netwk$dendrograms[[1]],
  mergedColors[netwk$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05 )

module_df <- data.frame(
  gene_id = names(netwk$colors),
  colors = labels2colors(netwk$colors)
)
# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes

# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)

# Add treatment names
MEs0$treatment = row.names(MEs0)

# tidy & plot data
mME = MEs0 %>%
  pivot_longer(-treatment) %>%
  mutate(
    name = gsub("ME", "", name),
    name = factor(name, levels = module_order)
  )

mME %>% ggplot(., aes(x=treatment, y=name, fill=value)) +
  geom_tile() +
  theme_bw() +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    limit = c(-1,1)) +
  theme(axis.text.x = element_text(angle=90)) +
  labs(title = "Module-trait Relationships", y = "Modules", fill="corr")

# pick out a few modules of interest here
modules_of_interest = c("turquoise", "blue", "brown", "grey")

# Pull out list of genes in that module
submod = module_df %>%
  subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

# Get normalized expression for those genes
subexpr = expr_normalized[submod$gene_id,]

submod_df = data.frame(subexpr) %>%
  mutate(
    gene_id = row.names(.)
  ) %>%
  pivot_longer(-gene_id) %>%
  mutate(
    module = module_df[gene_id,]$colors
  )

submod_df %>% ggplot(., aes(x=name, y=value, group=gene_id)) +
  geom_line(aes(color = module),
            alpha = 0.2) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  facet_grid(rows = vars(module), 
             # scales = "free_y"
             ) +
  labs(x = "treatment",
       y = "normalized expression") +
  scale_color_identity() + 
  scale_y_log10()
## Warning in self$trans$transform(x): NaNs produced
## Warning: Transformation introduced infinite values in continuous y-axis
## Warning: Removed 98 rows containing missing values (`geom_line()`).

Generate and Export Networks

The network file can be generated for Cytoscape or as an edge/vertices file.

genes_of_interest = module_df %>%
  subset(colors %in% modules_of_interest)

expr_of_interest = expr_normalized[genes_of_interest$gene_id,]
expr_of_interest[1:5,1:5]
##         YPS606_Mock_Rep1 YPS606_Mock_Rep2 YPS606_Mock_Rep3 YPS606_Mock_Rep4
## YBL039C            9.196            9.345            9.109            9.328
## YBL042C            7.176            7.039            7.164            7.077
## YBL054W            6.818            7.120            6.537            6.860
## YBR072W            0.763            0.412            0.801            0.847
## YBR116C           -0.927           -0.761            0.234           -1.379
##         YPS606_EtOH_Rep1
## YBL039C            1.804
## YBL042C            1.550
## YBL054W            0.401
## YBR072W            9.044
## YBR116C            0.569
# Only recalculate TOM for modules of interest (faster, altho there's some online discussion if this will be slightly off)
TOM = TOMsimilarityFromExpr(t(expr_of_interest),
                            power = picked_power)
## TOM calculation: adjacency..
## ..will use 4 parallel threads.
##  Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = row.names(expr_of_interest)
colnames(TOM) = row.names(expr_of_interest)

edge_list = data.frame(TOM) %>%
  mutate(
    gene1 = row.names(.)
  ) %>%
  pivot_longer(-gene1) %>%
  dplyr::rename(gene2 = name, correlation = value) %>%
  unique() %>%
  subset(!(gene1==gene2)) %>%
  mutate(
    module1 = module_df[gene1,]$colors,
    module2 = module_df[gene2,]$colors
  )

head(edge_list)
## # A tibble: 6 Ă— 5
##   gene1   gene2   correlation module1 module2  
##   <chr>   <chr>         <dbl> <chr>   <chr>    
## 1 YBL039C YBL042C    0.695    blue    blue     
## 2 YBL039C YBL054W    0.690    blue    blue     
## 3 YBL039C YBR072W    0.565    blue    turquoise
## 4 YBL039C YBR116C    0.000369 blue    brown    
## 5 YBL039C YBR117C    0.00224  blue    brown    
## 6 YBL039C YBR126C    0.243    blue    turquoise
adjacency = adjacency(input_mat, power=12, type="signed")
adjacency[adjacency < 0] = 0
adjacency[adjacency > 1] = 1
TOM = TOMsimilarity(adjacency, TOMType="signed")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
  is.na(GENENAME) ~ ORF,
  TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
colnames(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
  is.na(GENENAME) ~ ORF,
  TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
adj <- TOM
adj[adj > 0.5] = 1
adj[adj != 1] = 0
network <- graph.adjacency(adj)
network <- simplify(network,)  # removes self-loops
cor <- WGCNA::cor
results <- WGCNA::blockwiseModules(input_mat, power=12, TOMType="signed", networkType="signed")
cor <- stats::cor
V(network)$color <- results$colors
par(mar=c(0,0,0,0))
# remove unconnected nodes
network <- delete.vertices(network, degree(network)==0)
plot(network, layout=layout.fruchterman.reingold(network, niter=10000),type="o")

plot(network, layout=layout_with_mds(network), edge.arrow.size = 0.1)

use logFC data instead

FC_list <- read_delim("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/DE_yeast_TF_stress.txt.gz",
    delim = "\t", escape_double = FALSE, 
    name_repair = "universal",
    show_col_types=FALSE,
    trim_ws = TRUE) %>%
  dplyr::select(ID, contains("logFC")) %>%
  dplyr::select(ID, contains(".v.")) %>%
  column_to_rownames("ID")
## New names:
## • `logFC: WT NaCl response` -> `logFC..WT.NaCl.response`
## • `FDR: WT NaCl response` -> `FDR..WT.NaCl.response`
## • `logFC: msn24 mutant NaCl response` -> `logFC..msn24.mutant.NaCl.response`
## • `FDR: msn24 mutant NaCl response` -> `FDR..msn24.mutant.NaCl.response`
## • `logFC: yap1 mutant NaCl response` -> `logFC..yap1.mutant.NaCl.response`
## • `FDR: yap1 mutant NaCl response` -> `FDR..yap1.mutant.NaCl.response`
## • `logFC: skn7 mutant NaCl response` -> `logFC..skn7.mutant.NaCl.response`
## • `FDR: skn7 mutant NaCl response` -> `FDR..skn7.mutant.NaCl.response`
## • `logFC: WT EtOH response` -> `logFC..WT.EtOH.response`
## • `FDR: WT EtOH response` -> `FDR..WT.EtOH.response`
## • `logFC: msn24 mutant EtOH response` -> `logFC...msn24.mutant.EtOH.response`
## • `FDR: msn24 mutan EtOH response` -> `FDR...msn24.mutan..EtOH.response`
## • `logFC: yap1 mutant EtOH response` -> `logFC...yap1.mutant.EtOH.response`
## • `FDR: yap1 mutant EtOH response` -> `FDR...yap1.mutant.EtOH.response`
## • `logFC: skn7 mutant EtOH response` -> `logFC...skn7.mutant.EtOH.response`
## • `FDR: skn7 mutant EtOH response` -> `FDR...skn7.mutant.EtOH.response`
## • `logFC: WT v msn24 mutant: NaCl response` ->
##   `logFC..WT.v.msn24.mutant..NaCl.response`
## • `FDR: WT v msn24 mutant: NaCl response` ->
##   `FDR..WT.v.msn24.mutant..NaCl.response`
## • `logFC: WT v yap1 mutant: NaCl response` ->
##   `logFC..WT.v.yap1.mutant..NaCl.response`
## • `FDR: WT v yap1 mutant: NaCl response` ->
##   `FDR..WT.v.yap1.mutant..NaCl.response`
## • `logFC: WT v skn7 mutant: NaCl response` ->
##   `logFC..WT.v.skn7.mutant..NaCl.response`
## • `FDR: WT v skn7 mutant NaCl response` ->
##   `FDR..WT.v.skn7.mutant.NaCl.response`
## • `logFC: WT v msn24 mutant: EtOH response` ->
##   `logFC..WT.v.msn24.mutant..EtOH.response`
## • `FDR: WT v msn24 mutant: EtOH response` ->
##   `FDR..WT.v.msn24.mutant..EtOH.response`
## • `logFC: WT v yap1 mutant: EtOH response` ->
##   `logFC..WT.v.yap1.mutant..EtOH.response`
## • `FDR: WT v yap1 mutant: EtOH response` ->
##   `FDR..WT.v.yap1.mutant..EtOH.response`
## • `logFC: WT v skn7 mutant: EtOH response` ->
##   `logFC..WT.v.skn7.mutant..EtOH.response`
## • `FDR: WT v skn7 mutant: EtOH response` ->
##   `FDR..WT.v.skn7.mutant..EtOH.response`
FC_list_t <- t(FC_list)
cv_wpn <- matrixStats::colVars(FC_list_t)
# get variance corresponding to 95th percentile
q95_wpn <- quantile( matrixStats::colVars(FC_list_t), .75)  # <= changed to 95 quantile 

# to reduce dataset, only get genes with variance greater than above
FC_list_t <- FC_list_t[, cv_wpn > q95_wpn ]

adjacency = adjacency(FC_list_t, power=12, type="signed")
adjacency[adjacency < 0] = 0
adjacency[adjacency > 1] = 1
TOM = TOMsimilarity(adjacency, TOMType="unsigned")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
  is.na(GENENAME) ~ ORF,
  TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
colnames(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
  is.na(GENENAME) ~ ORF,
  TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
adj <- TOM
adj[adj > 0.3] = 1
adj[adj != 1] = 0
network <- graph.adjacency(adj)
network <- simplify(network)  # removes self-loops
cor <- WGCNA::cor
results <- WGCNA::blockwiseModules(FC_list_t, power=12, TOMType="unsigned", networkType="unsigned")
cor <- stats::cor
V(network)$color <- results$colors
par(mar=c(0,0,0,0))
# remove unconnected nodes
network <- delete.vertices(network, degree(network)==0)
# network <- delete.vertices(network, V(network)$color=="blue")
# network <- delete.vertices(network, V(network)$color=="yellow")
plot(network, layout=layout.fruchterman.reingold(network, niter=1000),
     vertex.size=3,edge.arrow.size=0.1, vertex.label.cex = 0.05,  vertex.label.color= "black", rescale=TRUE, ylim=c(-1,1),xlim=c(-1,1))

# plot(network, layout=layout_with_mds(network), vertex.label.cex = 0.5,
#      edge.arrow.size = 0.1, vertex.size=7,rescale=FALSE)

# plot(network, layout=layout_on_grid(network),
#      vertex.size=5,edge.arrow.size=0.1, vertex.label.cex = 0.5,rescale=TRUE)#, ylim=c(-5,-2),xlim=c(-8,5), asp = 1)
module_df <- data.frame(
  gene_id = names(results$colors),
  colors = labels2colors(results$colors)
)

# Convert labels to colors for plotting
mergedColors = labels2colors(results$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
  results$dendrograms[[1]],
  mergedColors[results$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05 )

# pick out a few modules of interest here
modules_of_interest = c("midnightblue", "lightyellow", "cyan", "blue", "salmon", "turquoise", "royalblue", "purple", "lightcyan", "red", "lightgreen", "yellow", "grey60", "green", "brown", "tan", "pink", "greenyellow", "magenta", "black")

# Pull out list of genes in that module
submod = module_df %>%
  subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

# Get normalized expression for those genes
subexpr = FC_list[submod$gene_id,]

submod_df = data.frame(subexpr) %>%
  mutate(
    gene_id = row.names(.)
  ) %>%
  pivot_longer(-gene_id) %>%
  mutate(
    module = module_df[gene_id,]$colors
  )

submod_df %>% ggplot(., aes(x=name, y=value, group=gene_id)) +
  geom_line(aes(color = module),
            alpha = 0.2) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 40)
  ) +
  facet_grid(rows = vars(module), 
             # scales = "free_y"
             ) +
  scale_color_identity() + 
  labs(x = "treatment",
       y = "normalized expression")# +

  # scale_y_log10()
# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(FC_list_t, mergedColors)$eigengenes

# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)

# Add treatment names
MEs0$treatment = row.names(MEs0)

# tidy & plot data
mME = MEs0 %>%
  pivot_longer(-treatment) %>%
  mutate(
    name = gsub("ME", "", name),
    name = factor(name, levels = module_order)
  )

mME %>% ggplot(., aes(x=treatment, y=name, fill=value)) +
  geom_tile() +
  theme_bw() +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    limit = c(-1,1)) +
  theme(axis.text.x = element_text(angle=20)) +
  labs(title = "Module-trait Relationships", y = "Modules", fill="corr")

GO enrichment on those modules

go_results_yellow <- clusterProfiler::enrichGO(
  gene = submod_df %>% filter(module=="lightyellow") %>% pull(gene_id) %>% unique(),
  OrgDb = "org.Sc.sgd.db",
  keyType = "ORF",
  ont = "ALL"
) |>
  # let's add a 'richFactor' column that gives us the proportion of genes DE in the term
  mutate(richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))

# take a peak at the results
head(go_results_yellow)
##            ONTOLOGY         ID
## GO:0006091       BP GO:0006091
## GO:0015980       BP GO:0015980
## GO:0006979       BP GO:0006979
## GO:0034599       BP GO:0034599
## GO:0005975       BP GO:0005975
## GO:0062197       BP GO:0062197
##                                                    Description GeneRatio
## GO:0006091      generation of precursor metabolites and energy    36/289
## GO:0015980 energy derivation by oxidation of organic compounds    27/289
## GO:0006979                        response to oxidative stress    22/289
## GO:0034599               cellular response to oxidative stress    20/289
## GO:0005975                      carbohydrate metabolic process    31/289
## GO:0062197                cellular response to chemical stress    24/289
##             BgRatio       pvalue     p.adjust       qvalue
## GO:0006091 226/6424 1.526607e-11 1.581565e-08 1.446260e-08
## GO:0015980 164/6424 3.020572e-09 1.564656e-06 1.430797e-06
## GO:0006979 127/6424 3.626456e-08 1.252336e-05 1.145197e-05
## GO:0034599 111/6424 7.853587e-08 2.034079e-05 1.860060e-05
## GO:0005975 245/6424 1.238047e-07 2.565233e-05 2.345773e-05
## GO:0062197 161/6424 1.692218e-07 2.921896e-05 2.671922e-05
##                                                                                                                                                                                                                                                                                                       geneID
## GO:0006091 YBR026C/YCL035C/YDL021W/YDR148C/YDR516C/YEL011W/YEL039C/YER054C/YER067W/YFR015C/YFR017C/YFR053C/YGL191W/YGR043C/YGR248W/YHR001W-A/YIL111W/YIL125W/YJL052W/YJL166W/YJR121W/YKL035W/YKL085W/YKL148C/YLR258W/YML070W/YML110C/YML120C/YMR081C/YMR105C/YMR145C/YMR315W/YOR178C/YOR347C/YPL172C/YPR160W
## GO:0015980                                                                         YBR026C/YDR148C/YEL011W/YEL039C/YER054C/YER067W/YFR015C/YFR017C/YGL191W/YHR001W-A/YIL111W/YIL125W/YJL166W/YJR121W/YKL035W/YKL085W/YKL148C/YLR258W/YML070W/YML110C/YML120C/YMR081C/YMR105C/YMR145C/YOR178C/YPL172C/YPR160W
## GO:0006979                                                                                                                   YBL064C/YBR046C/YBR126C/YCL035C/YCR083W/YDL124W/YDR231C/YFL014W/YFR014C/YGR043C/YGR088W/YHR008C/YHR104W/YJR096W/YKL026C/YKL062W/YKL150W/YKR066C/YLR109W/YML028W/YMR037C/YMR250W
## GO:0034599                                                                                                                                   YBL064C/YBR046C/YBR126C/YCL035C/YCR083W/YDL124W/YFL014W/YFR014C/YGR043C/YHR008C/YHR104W/YJR096W/YKL026C/YKL062W/YKL150W/YKR066C/YLR109W/YML028W/YMR037C/YMR250W
## GO:0005975                                           YBR001C/YBR053C/YBR105C/YBR126C/YDL021W/YDR001C/YDR516C/YEL011W/YER054C/YFR015C/YFR017C/YFR053C/YGR043C/YGR143W/YGR248W/YHR104W/YJL052W/YJR096W/YKL035W/YKL085W/YLR258W/YML028W/YML070W/YML100W/YMR105C/YMR196W/YMR261C/YMR305C/YOR178C/YOR347C/YPR160W
## GO:0062197                                                                                                   YAL005C/YAL028W/YBL064C/YBR046C/YBR072W/YBR126C/YCL035C/YCR083W/YDL124W/YFL014W/YFR014C/YGR043C/YHR008C/YHR104W/YJR096W/YKL026C/YKL062W/YKL150W/YKR066C/YLR109W/YML028W/YMR037C/YMR250W/YNL234W
##            Count richFactor
## GO:0006091    36  0.1592920
## GO:0015980    27  0.1646341
## GO:0006979    22  0.1732283
## GO:0034599    20  0.1801802
## GO:0005975    31  0.1265306
## GO:0062197    24  0.1490683
ggplot(go_results_yellow,
       showCategory = 15,
       aes(richFactor, fct_reorder(Description, richFactor))) +
  geom_segment(aes(xend = 0, yend = Description)) +
  geom_point(aes(color = p.adjust, size = Count)) +
  scale_color_gradientn(
    colours = c("#f7ca64", "#46bac2", "#7e62a3"),
    trans = "log10",
    guide = guide_colorbar(reverse = TRUE, order = 1)
  ) +
  scale_size_continuous(range = c(2, 10)) +
  xlab("Rich Factor") +
  ylab(NULL) +
  ggtitle("Enriched GO Categories") +
  theme_bw()

# create list of genes in each one
ID_network_list <- submod_df %>%
  # data.frame() %>%
  # tibble::rownames_to_column("ORF") %>%
  # tidyr::pivot_longer(-ORF,names_to="Time", values_to = "DE_direction") %>%
  # mutate(Time = readr::parse_number(Time),
  #        DE_name = case_when(DE_direction == 1 ~ "up",
  #                            DE_direction == -1 ~ "down",
  #                            TRUE~NA),
  #        # create grouping variable
  #        group=paste0("DE_",DE_name,"_0",Time, "min")) %>%
  # # modify names to pad leading 0's to fix time order
  # mutate(group = str_replace(group,"_0120", "_120")) %>%
  # mutate(group = str_replace(group,"_0240", "_240")) %>%
  # mutate(entrezID=AnnotationDbi::select(org.Sc.sgd.db,keys=ORF,columns="ENTREZID",
  #     multiVals = "first")$ENTREZID) %>%
  group_by(module) %>%
  # only get genes that are DE
  # filter(DE_direction != 0) %>%
  group_split() %>% # split into many lists
  purrr::set_names(purrr::map_chr(., ~.x$module[1])) %>% # assign names
  # get just the ORF names
  map(., ~ .x |> pull(all_of("gene_id")))


GO_enrich_network_all <- clusterProfiler::compareCluster(
                                  geneCluster   = ID_network_list, 
                                  ont           = "BP",
                                  OrgDb         = org.Sc.sgd.db,
                                  pAdjustMethod = "BH",
                                  pvalueCutoff  = 0.05,
                                  qvalueCutoff  = 0.05,
                                  readable      = FALSE,
                                  fun           = 'enrichGO',
                                  keyType       = 'ORF'
                                  )  |>
  # let's add a 'richFactor' column that gives us the proportion of genes DE in the term
  mutate(richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))


clusterProfiler::dotplot(GO_enrich_network_all,
        showCategory=10
        # if we choose "ALL", we can show them all with facet.
        #facet="ONTOLOGY",
        ) + scale_x_discrete(guide = guide_axis(angle = 60))

# get rid of sticks to save space
GO_enrich_network_all %>%
# GO_enrich_proteome_all %>%
  as.data.frame() %>% 
  group_by(Cluster) %>%
  slice_min(order_by=p.adjust, n = 8) %>%
  # separate_wider_delim(Cluster, delim = "_", names = c("junk","DE", "time")) %>%
  # mutate(Time = readr::parse_number(time)) %>%
  ggplot(.,
           aes(Cluster, fct_reorder(Description, richFactor))) +
      geom_point(aes(color = p.adjust, size = Count)) +
      scale_color_gradientn(
        colours = c("#f7ca64", "#46bac2", "#7e62a3"),
        trans = "log10",
        guide = guide_colorbar(reverse = TRUE, order = 1)
      ) +
      scale_size_continuous(range = c(2, 12)) +
      scale_y_discrete(labels = function(x) str_wrap(x, width = 25)) +
      xlab("Time (minutes)") +
      ylab(NULL) +
      ggtitle("Enriched GO Categories") +
      # facet_wrap(~DE, scales = "free_y") +
      theme_bw() +
      theme(text = element_text(size=14))

I want to use just the counts, and build a network for each different knockout+stress combination? Could group either by mutant, stress, or both. It feels like both makes most sense.

Network using only DE genes

# DE_genes
DE_genes <- read_delim("https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/DE_yeast_TF_stress.txt.gz",
    delim = "\t", escape_double = FALSE, 
    name_repair = "universal",
    show_col_types=FALSE,
    trim_ws = TRUE) %>%
  dplyr::select(ID, contains("FDR")) %>%
  dplyr::select(ID, contains(".v.")) %>%
  pivot_longer(-ID, names_to = "contrast", values_to = "FDR") %>%
  filter(FDR < 0.01) %>%
  pull(ID) %>%
  unique()
## New names:
## • `logFC: WT NaCl response` -> `logFC..WT.NaCl.response`
## • `FDR: WT NaCl response` -> `FDR..WT.NaCl.response`
## • `logFC: msn24 mutant NaCl response` -> `logFC..msn24.mutant.NaCl.response`
## • `FDR: msn24 mutant NaCl response` -> `FDR..msn24.mutant.NaCl.response`
## • `logFC: yap1 mutant NaCl response` -> `logFC..yap1.mutant.NaCl.response`
## • `FDR: yap1 mutant NaCl response` -> `FDR..yap1.mutant.NaCl.response`
## • `logFC: skn7 mutant NaCl response` -> `logFC..skn7.mutant.NaCl.response`
## • `FDR: skn7 mutant NaCl response` -> `FDR..skn7.mutant.NaCl.response`
## • `logFC: WT EtOH response` -> `logFC..WT.EtOH.response`
## • `FDR: WT EtOH response` -> `FDR..WT.EtOH.response`
## • `logFC: msn24 mutant EtOH response` -> `logFC...msn24.mutant.EtOH.response`
## • `FDR: msn24 mutan EtOH response` -> `FDR...msn24.mutan..EtOH.response`
## • `logFC: yap1 mutant EtOH response` -> `logFC...yap1.mutant.EtOH.response`
## • `FDR: yap1 mutant EtOH response` -> `FDR...yap1.mutant.EtOH.response`
## • `logFC: skn7 mutant EtOH response` -> `logFC...skn7.mutant.EtOH.response`
## • `FDR: skn7 mutant EtOH response` -> `FDR...skn7.mutant.EtOH.response`
## • `logFC: WT v msn24 mutant: NaCl response` ->
##   `logFC..WT.v.msn24.mutant..NaCl.response`
## • `FDR: WT v msn24 mutant: NaCl response` ->
##   `FDR..WT.v.msn24.mutant..NaCl.response`
## • `logFC: WT v yap1 mutant: NaCl response` ->
##   `logFC..WT.v.yap1.mutant..NaCl.response`
## • `FDR: WT v yap1 mutant: NaCl response` ->
##   `FDR..WT.v.yap1.mutant..NaCl.response`
## • `logFC: WT v skn7 mutant: NaCl response` ->
##   `logFC..WT.v.skn7.mutant..NaCl.response`
## • `FDR: WT v skn7 mutant NaCl response` ->
##   `FDR..WT.v.skn7.mutant.NaCl.response`
## • `logFC: WT v msn24 mutant: EtOH response` ->
##   `logFC..WT.v.msn24.mutant..EtOH.response`
## • `FDR: WT v msn24 mutant: EtOH response` ->
##   `FDR..WT.v.msn24.mutant..EtOH.response`
## • `logFC: WT v yap1 mutant: EtOH response` ->
##   `logFC..WT.v.yap1.mutant..EtOH.response`
## • `FDR: WT v yap1 mutant: EtOH response` ->
##   `FDR..WT.v.yap1.mutant..EtOH.response`
## • `logFC: WT v skn7 mutant: EtOH response` ->
##   `logFC..WT.v.skn7.mutant..EtOH.response`
## • `FDR: WT v skn7 mutant: EtOH response` ->
##   `FDR..WT.v.skn7.mutant..EtOH.response`
counts_DE <- read.delim('https://github.com/clstacy/GenomicDataAnalysis_Fa23/raw/main/data/YPS606_TF_Rep1_4_ExpectedCounts.txt',
    sep = "\t",
    header = T,
    row.names = 1
  ) %>%
  rownames_to_column("ORF") %>%
  dplyr::filter(ORF %in% DE_genes) %>%
  column_to_rownames("ORF")



# read into a DGElist
y_DE <- edgeR::DGEList(counts_DE,
                    group=as.factor(
                      # remove replicate from name
                      sub("_[^_]+$", "", colnames(counts_DE))
                    ),
                    genes = rownames(counts_DE)
                    )


# filter low counts
keep_DE <- rowSums(cpm(y_DE) > 1) >= 4
y_DE <- y_DE[keep_DE,]

# normalize with cpm
normalized_counts_DE <- cpm(y_DE,
                         log = FALSE)

expr_normalized_DE <- normalized_counts_DE#[ rv_wpn > q975_wpn, ]


input_mat = t(expr_normalized_DE)

picked_power = 7
temp_cor <- cor       
cor <- WGCNA::cor         # Force it to use WGCNA cor function (fix a namespace conflict issue)
netwk <- blockwiseModules(input_mat,                # <= input here

                          # == Adjacency Function ==
                          power = picked_power,                # <= power here
                          networkType = "signed",

                          # == Tree and Block Options ==
                          deepSplit = 2,
                          pamRespectsDendro = F,
                          # detectCutHeight = 0.75,
                          minModuleSize = 30,
                          maxBlockSize = 4000,

                          # == Module Adjustments ==
                          reassignThreshold = 0,
                          mergeCutHeight = 0.25,

                          # == TOM == Archive the run results in TOM file (saves time)
                          saveTOMs = T,
                          saveTOMFileBase = "ER",

                          # == Output Options
                          numericLabels = T,
                          verbose = 3)
##  Calculating module eigengenes block-wise from all genes
##    Flagging genes and samples with too many missing values...
##     ..step 1
##  ..Working on block 1 .
##     TOM calculation: adjacency..
##     ..will use 4 parallel threads.
##      Fraction of slow calculations: 0.000000
##     ..connectivity..
##     ..matrix multiplication (system BLAS)..
##     ..normalization..
##     ..done.
##    ..saving TOM for block 1 into file ER-block.1.RData
##  ....clustering..
##  ....detecting modules..
##  ....calculating module eigengenes..
##  ....checking kME in modules..
##      ..removing 20 genes from module 1 because their KME is too low.
##      ..removing 5 genes from module 2 because their KME is too low.
##      ..removing 6 genes from module 3 because their KME is too low.
##  ..merging modules that are too close..
##      mergeCloseModules: Merging modules whose distance is less than 0.25
##        Calculating new MEs...
cor <- temp_cor
# Convert labels to colors for plotting
mergedColors = labels2colors(netwk$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
  netwk$dendrograms[[1]],
  mergedColors[netwk$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05 )

module_df <- data.frame(
  gene_id = names(netwk$colors),
  colors = labels2colors(netwk$colors)
)
# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes

# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)

# Add treatment names
MEs0$treatment = row.names(MEs0)

# tidy & plot data
mME = MEs0 %>%
  pivot_longer(-treatment) %>%
  mutate(
    name = gsub("ME", "", name),
    name = factor(name, levels = module_order)
  )

mME %>% ggplot(., aes(x=treatment, y=name, fill=value)) +
  geom_tile() +
  theme_bw() +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    limit = c(-1,1)) +
  theme(axis.text.x = element_text(angle=90)) +
  labs(title = "Module-trait Relationships", y = "Modules", fill="corr")

# pick out a few modules of interest here
modules_of_interest = c("turquoise", "blue", "brown", "grey", "yellow")

# Pull out list of genes in that module
submod = module_df #%>%
  # subset(colors %in% modules_of_interest)

row.names(module_df) = module_df$gene_id

# Get normalized expression for those genes
subexpr = expr_normalized_DE[submod$gene_id,]

submod_df = data.frame(subexpr) %>%
  mutate(
    gene_id = row.names(.)
  ) %>%
  pivot_longer(-gene_id) %>%
  mutate(
    module = module_df[gene_id,]$colors
  )

submod_df %>% ggplot(., aes(x=name, y=value, group=gene_id)) +
  geom_line(aes(color = module),
            alpha = 0.2) +
  theme_bw() +
  theme(
    axis.text.x = element_text(angle = 90)
  ) +
  facet_grid(rows = vars(module), 
             # scales = "free_y"
             ) +
  labs(x = "treatment",
       y = "normalized expression") +
  scale_color_identity() + 
  scale_y_log10()
## Warning: Transformation introduced infinite values in continuous y-axis

Generate and Export Networks

The network file can be generated for Cytoscape or as an edge/vertices file.

genes_of_interest = module_df #%>%
  #subset(colors %in% modules_of_interest)

expr_of_interest = expr_normalized_DE#[genes_of_interest$gene_id,]
expr_of_interest[1:5,1:5]
##         YPS606_Mock_Rep1 YPS606_Mock_Rep2 YPS606_Mock_Rep3 YPS606_Mock_Rep4
## YAL003W      3826.774552      4339.529451       3615.74469       4268.77586
## YAL005C       505.201128       472.097082        680.92864        599.56587
## YAL007C       301.057121       302.063832        259.94362        267.73414
## YAL008W         9.419511         9.703168         10.51713         10.85248
## YAL012W      2563.701428      2279.260052       2429.33302       2347.98552
##         YPS606_EtOH_Rep1
## YAL003W       2705.18447
## YAL005C      25551.22027
## YAL007C        690.08596
## YAL008W         56.01908
## YAL012W       1632.89141
# Only recalculate TOM for modules of interest (faster, altho there's some online discussion if this will be slightly off)
TOM = TOMsimilarityFromExpr(t(expr_of_interest),
                            power = picked_power)
## TOM calculation: adjacency..
## ..will use 4 parallel threads.
##  Fraction of slow calculations: 0.000000
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = row.names(expr_of_interest)
colnames(TOM) = row.names(expr_of_interest)

edge_list = data.frame(TOM) %>%
  mutate(
    gene1 = row.names(.)
  ) %>%
  pivot_longer(-gene1) %>%
  dplyr::rename(gene2 = name, correlation = value) %>%
  unique() %>%
  subset(!(gene1==gene2)) %>%
  mutate(
    module1 = module_df[gene1,]$colors,
    module2 = module_df[gene2,]$colors
  )

head(edge_list)
## # A tibble: 6 Ă— 5
##   gene1   gene2   correlation module1 module2  
##   <chr>   <chr>         <dbl> <chr>   <chr>    
## 1 YAL003W YAL005C     0.0166  blue    turquoise
## 2 YAL003W YAL007C     0.00144 blue    turquoise
## 3 YAL003W YAL008W     0.0479  blue    turquoise
## 4 YAL003W YAL012W     0.00485 blue    blue     
## 5 YAL003W YAL017W     0.0346  blue    turquoise
## 6 YAL003W YAL023C     0.0236  blue    turquoise

What does the distribution of correlations look like?

edge_list %>%
  ggplot(aes(x=correlation)) +
  geom_histogram(bins=60) +
  theme_bw() +
  facet_grid(rows=vars(module1), cols = vars(module2))

What is the correct power for this analysis?

Find ideal power

# Choose a set of soft-thresholding powers
powers = c(c(1:10), seq(from = 12, to = 20, by = 2))

# Call the network topology analysis function
sft = pickSoftThreshold(
  input_mat,             # <= Input data
  #blockSize = 30,
  powerVector = powers,
  verbose = 5
  )
## pickSoftThreshold: will use block size 1972.
##  pickSoftThreshold: calculating connectivity for given powers...
##    ..working on genes 1 through 1972 of 1972
## Warning: executing %dopar% sequentially: no parallel backend registered
##    Power SFT.R.sq     slope truncated.R.sq mean.k. median.k. max.k.
## 1      1 9.35e-01  2.470000          0.955   947.0    1000.0   1250
## 2      2 8.30e-01  0.887000          0.857   598.0     626.0    946
## 3      3 5.10e-01  0.316000          0.489   423.0     427.0    767
## 4      4 7.30e-06  0.000669         -0.188   319.0     304.0    645
## 5      5 5.31e-01 -0.208000          0.398   251.0     225.0    556
## 6      6 7.82e-01 -0.336000          0.720   203.0     170.0    489
## 7      7 9.00e-01 -0.447000          0.876   168.0     133.0    438
## 8      8 9.56e-01 -0.519000          0.944   142.0     106.0    396
## 9      9 9.66e-01 -0.598000          0.961   121.0      85.8    362
## 10    10 9.62e-01 -0.657000          0.955   105.0      70.1    333
## 11    12 9.86e-01 -0.726000          0.983    81.4      48.5    287
## 12    14 9.68e-01 -0.788000          0.961    65.0      34.9    252
## 13    16 9.74e-01 -0.838000          0.970    53.1      25.6    224
## 14    18 9.79e-01 -0.876000          0.979    44.3      19.3    202
## 15    20 9.71e-01 -0.899000          0.968    37.6      15.0    183
par(mfrow = c(1,2));
cex1 = 0.9;

plot(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     xlab = "Soft Threshold (power)",
     ylab = "Scale Free Topology Model Fit, signed R^2",
     main = paste("Scale independence")
)
text(sft$fitIndices[, 1],
     -sign(sft$fitIndices[, 3]) * sft$fitIndices[, 2],
     labels = powers, cex = cex1, col = "red"
)
abline(h = 0.90, col = "red")
plot(sft$fitIndices[, 1],
     sft$fitIndices[, 5],
     xlab = "Soft Threshold (power)",
     ylab = "Mean Connectivity",
     type = "n",
     main = paste("Mean connectivity")
)
text(sft$fitIndices[, 1],
     sft$fitIndices[, 5],
     labels = powers,
     cex = cex1, col = "red")

It looks like power=7 or 8 is reasonable. Let’s try with 7.

adjacency = adjacency(input_mat, power=7, type="signed")
adjacency[adjacency < 0] = 0
adjacency[adjacency > 1] = 1
TOM = TOMsimilarity(adjacency, TOMType="signed")
## ..connectivity..
## ..matrix multiplication (system BLAS)..
## ..normalization..
## ..done.
# Add gene names to row and columns
row.names(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
  is.na(GENENAME) ~ ORF,
  TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
colnames(TOM) = AnnotationDbi::select(org.Sc.sgd.db,keys=row.names(adjacency),columns="GENENAME") %>% mutate(label = case_when(
  is.na(GENENAME) ~ ORF,
  TRUE ~ GENENAME
)) %>% pull(label)
## 'select()' returned 1:1 mapping between keys and columns
adj <- TOM
adj[adj > 0.3] = 1
adj[adj != 1] = 0
network <- graph.adjacency(adj)
network <- simplify(network)  # removes self-loops
cor <- WGCNA::cor
results <- WGCNA::blockwiseModules(input_mat, power=7, TOMType="signed", networkType="signed")
cor <- stats::cor
V(network)$color <- results$colors
par(mar=c(0,0,0,0))
# remove unconnected nodes
network <- delete.vertices(network, degree(network)==0)
# plot(network, layout=layout.fruchterman.reingold(network, niter=10000),type="o")
# plot(network, layout=layout_with_mds(network), edge.arrow.size = 0.1)
plot(network, layout=layout.fruchterman.reingold(network, niter=1000),
     vertex.size=3,edge.arrow.size=0.1, 
     vertex.label.cex = 0.01,  # increase this value to see gene names
     vertex.label.color= "black", rescale=TRUE, ylim=c(-1,1),xlim=c(-1,1))

# Convert labels to colors for plotting
mergedColors = labels2colors(results$colors)
# Plot the dendrogram and the module colors underneath
plotDendroAndColors(
  results$dendrograms[[1]],
  mergedColors[results$blockGenes[[1]]],
  "Module colors",
  dendroLabels = FALSE,
  hang = 0.03,
  addGuide = TRUE,
  guideHang = 0.05 )

# Get Module Eigengenes per cluster
MEs0 <- moduleEigengenes(input_mat, mergedColors)$eigengenes

# Reorder modules so similar modules are next to each other
MEs0 <- orderMEs(MEs0)
module_order = names(MEs0) %>% gsub("ME","", .)

# Add treatment names
MEs0$treatment = row.names(MEs0)

# tidy & plot data
mME = MEs0 %>%
  pivot_longer(-treatment) %>%
  mutate(
    name = gsub("ME", "", name),
    name = factor(name, levels = module_order)
  )



mME %>%
  ggplot(., aes(x = treatment, y = name, fill = value)) +
  geom_tile() +
  theme_bw() +
  scale_fill_gradient2(
    low = "blue",
    high = "red",
    mid = "white",
    midpoint = 0,
    limit = c(-1, 1)
  ) +
  theme(axis.text.x = element_text(
    angle = 90,
    hjust = 1,
    vjust = 1
  )) +
  labs(title = "Module-trait Relationships", y = "Modules", fill = "corr")

(
  test_plot <- expr_normalized_DE %>%
  data.frame() %>%
    mutate(
        gene_id = row.names(.)
    ) %>%
    pivot_longer(-gene_id) %>%
    mutate(
        module = results$colors[gene_id]
    ) %>% 
  separate_wider_delim(name, delim = "_", names = c("strain", "stress", "replicate"),cols_remove = FALSE) %>%
  ggplot(., aes(x=name, y=value+1, group=gene_id)) +
    geom_line(aes(color = "grey"), size=1,
            alpha = 0.2) +
    geom_line(aes(color = module),
            alpha = 0.2) +
    # geom_boxplot(width=0.5,
    #              aes(group=name, color = module),
    #              outlier.shape = NA,
    #              alpha=0.9) +
    theme_classic() +
    theme(axis.text.x = element_text(angle = 90)) +
    facet_grid(rows = vars(module), 
             cols= vars(strain),
             scales = "free"
             ) +
    scale_x_discrete(position = "top") +
    labs(x = "treatment",
       y = "normalized expression") +
    # use color variable as the color
    scale_color_identity() + 
    scale_y_log10()
)
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

GO enrichments

# create list of genes in each one
ID_network_list <- expr_normalized_DE %>%
  data.frame() %>%
    mutate(
        gene_id = row.names(.)
    ) %>%
    pivot_longer(-gene_id) %>%
    mutate(
        module = results$colors[gene_id]
    ) %>%
  group_by(module) %>%
  # only get genes that are DE
  # filter(DE_direction != 0) %>%
  group_split() %>% # split into many lists
  purrr::set_names(purrr::map_chr(., ~.x$module[1])) %>% # assign names
  # get just the ORF names
  map(., ~ .x |> pull(all_of("gene_id"))) %>%
  # remove duplicates
  map(., ~ unique(.))
  


GO_enrich_network_all <- clusterProfiler::compareCluster(
                                  geneCluster   = ID_network_list, 
                                  ont           = "BP",
                                  OrgDb         = org.Sc.sgd.db,
                                  pAdjustMethod = "BH",
                                  pvalueCutoff  = 0.5, 
                                  qvalueCutoff  = 0.5,
                                  readable      = FALSE,
                                  fun           = 'enrichGO',
                                  keyType       = 'ORF'
                                  )  |>
  # let's add a 'richFactor' column that gives us the proportion of genes DE in the term
  mutate(richFactor = Count / as.numeric(sub("/\\d+", "", BgRatio)))


# get rid of sticks to save space
GO_enrich_network_all %>%

  as.data.frame() %>% 
  group_by(Cluster) %>%
  slice_min(order_by=pvalue, n = 5) %>%
  ggplot(.,
           aes(Cluster, fct_reorder(Description, richFactor))) +
      geom_point(aes(color = p.adjust, size = Count)) +
      scale_color_gradientn(
        colours = c("#f7ca64", "#46bac2", "#7e62a3"),
        trans = "log10",
        guide = guide_colorbar(reverse = TRUE, order = 1)
      ) +
      scale_size_continuous(range = c(2, 12)) +
      scale_x_discrete(position = "top") +
      scale_y_discrete(labels = function(x) str_wrap(x, width = 25)) +
      xlab("Module") +
      ylab(NULL) +
      ggtitle("Enriched GO Categories by Module") +
      # facet_wrap(~DE, scales = "free_y") +
      theme_bw() +
      theme(text = element_text(size=14),
            axis.text.x = element_text(angle = 90,hjust=0) )

# here are the data to add to cowplot:
GO_summaries <- GO_enrich_network_all %>%
  as.data.frame() %>% 
  group_by(Cluster) %>%
  slice_min(order_by=p.adjust, n = 5) %>%
  slice_head(n = 5) %>%
  mutate(Description = case_when(
    p.adjust < 0.05 ~ paste0(Description," (",Count,")"),
    TRUE ~ " "
  )) %>%
  summarize(description = str_c(Description, collapse = "\n")) %>%
  pull(description)


# draw plot with GO terms
cowplot::ggdraw(test_plot +
                  theme(plot.margin = margin(t = 0.5, r = 5, b = 0.5, l = 0.1, unit = "cm"))) +
  cowplot::draw_text(GO_summaries, 
                     x = 0.76,  # Adjust this value for the desired horizontal position
                     y = rev(seq(0.05, 0.82, length.out = length(GO_summaries))),  # Adjust this sequence for the desired vertical positions
                     size = 6, 
                     hjust = 0,  # Align to the left
                     vjust = 0.5)

Questions

Question 1: Which time point of gene (mRNA) expression showed the highest correlation with changes in protein abundance?

Question 2: Does the pattern of mRNA and protein levels match your expectatoins?

Question 3: How is the intersection between differentially expressed gene sets calculated, and why is it relevant?

Be sure to knit this file into a pdf or html file once you’re finished.

System information for reproducibility:

pander::pander(sessionInfo())

R version 4.3.1 (2023-06-16)

Platform: aarch64-apple-darwin20 (64-bit)

locale: en_US.UTF-8||en_US.UTF-8||en_US.UTF-8||C||en_US.UTF-8||en_US.UTF-8

attached base packages: stats4, stats, graphics, grDevices, utils, datasets, methods and base

other attached packages: igraph(v.1.5.1), cowplot(v.1.1.1), org.Sc.sgd.db(v.3.18.0), AnnotationDbi(v.1.64.1), IRanges(v.2.36.0), S4Vectors(v.0.40.1), Biobase(v.2.62.0), BiocGenerics(v.0.48.1), matrixStats(v.1.0.0), edgeR(v.4.0.1), limma(v.3.58.1), WGCNA(v.1.72-1), fastcluster(v.1.2.3), dynamicTreeCut(v.1.63-1), reactable(v.0.4.4), Glimma(v.2.12.0), statmod(v.1.5.0), BiocManager(v.1.30.22), pander(v.0.6.5), knitr(v.1.45), lubridate(v.1.9.3), forcats(v.1.0.0), stringr(v.1.5.0), dplyr(v.1.1.3), purrr(v.1.0.2), readr(v.2.1.4), tidyr(v.1.3.0), tibble(v.3.2.1), ggplot2(v.3.4.4), tidyverse(v.2.0.0) and pacman(v.0.5.1)

loaded via a namespace (and not attached): splines(v.4.3.1), later(v.1.3.1), ggplotify(v.0.1.2), bitops(v.1.0-7), filelock(v.1.0.2), polyclip(v.1.10-6), preprocessCore(v.1.64.0), rpart(v.4.1.21), lifecycle(v.1.0.3), doParallel(v.1.0.17), lattice(v.0.22-5), vroom(v.1.6.4), MASS(v.7.3-60), backports(v.1.4.1), magrittr(v.2.0.3), Hmisc(v.5.1-1), sass(v.0.4.7), rmarkdown(v.2.25), jquerylib(v.0.1.4), yaml(v.2.3.7), httpuv(v.1.6.12), RColorBrewer(v.1.1-3), DBI(v.1.1.3), abind(v.1.4-5), zlibbioc(v.1.48.0), GenomicRanges(v.1.54.1), ggraph(v.2.1.0), RCurl(v.1.98-1.13), yulab.utils(v.0.1.0), nnet(v.7.3-19), tweenr(v.2.0.2), rappdirs(v.0.3.3), GenomeInfoDbData(v.1.2.11), enrichplot(v.1.22.0), ggrepel(v.0.9.4), tidytree(v.0.4.5), codetools(v.0.2-19), DelayedArray(v.0.28.0), ggforce(v.0.4.1), DOSE(v.3.28.0), tidyselect(v.1.2.0), aplot(v.0.2.2), farver(v.2.1.1), viridis(v.0.6.4), BiocFileCache(v.2.10.1), base64enc(v.0.1-3), jsonlite(v.1.8.7), ellipsis(v.0.3.2), tidygraph(v.1.2.3), Formula(v.1.2-5), survival(v.3.5-7), iterators(v.1.0.14), foreach(v.1.5.2), tools(v.4.3.1), treeio(v.1.26.0), HPO.db(v.0.99.2), Rcpp(v.1.0.11), glue(v.1.6.2), gridExtra(v.2.3), SparseArray(v.1.2.2), xfun(v.0.41), DESeq2(v.1.42.0), qvalue(v.2.34.0), MatrixGenerics(v.1.14.0), GenomeInfoDb(v.1.38.0), withr(v.2.5.2), fastmap(v.1.1.1), fansi(v.1.0.5), digest(v.0.6.33), gridGraphics(v.0.5-1), timechange(v.0.2.0), R6(v.2.5.1), mime(v.0.12), colorspace(v.2.1-0), GO.db(v.3.18.0), RSQLite(v.2.3.2), utf8(v.1.2.4), generics(v.0.1.3), data.table(v.1.14.8), graphlayouts(v.1.0.1), httr(v.1.4.7), htmlwidgets(v.1.6.2), S4Arrays(v.1.2.0), scatterpie(v.0.2.1), pkgconfig(v.2.0.3), gtable(v.0.3.4), blob(v.1.2.4), impute(v.1.76.0), XVector(v.0.42.0), shadowtext(v.0.1.2), clusterProfiler(v.4.10.0), htmltools(v.0.5.7), fgsea(v.1.28.0), scales(v.1.2.1), png(v.0.1-8), ggfun(v.0.1.3), rstudioapi(v.0.15.0), tzdb(v.0.4.0), reshape2(v.1.4.4), nlme(v.3.1-163), checkmate(v.2.3.0), curl(v.5.1.0), cachem(v.1.0.8), BiocVersion(v.3.18.0), parallel(v.4.3.1), HDO.db(v.0.99.1), foreign(v.0.8-85), pillar(v.1.9.0), grid(v.4.3.1), vctrs(v.0.6.4), promises(v.1.2.1), dbplyr(v.2.4.0), xtable(v.1.8-4), cluster(v.2.1.4), htmlTable(v.2.4.2), evaluate(v.0.23), cli(v.3.6.1), locfit(v.1.5-9.8), compiler(v.4.3.1), rlang(v.1.1.1), crayon(v.1.5.2), labeling(v.0.4.3), fs(v.1.6.3), plyr(v.1.8.9), stringi(v.1.7.12), viridisLite(v.0.4.2), BiocParallel(v.1.36.0), MPO.db(v.0.99.7), munsell(v.0.5.0), Biostrings(v.2.70.1), lazyeval(v.0.2.2), GOSemSim(v.2.28.0), Matrix(v.1.6-1.1), patchwork(v.1.1.3), hms(v.1.1.3), bit64(v.4.0.5), KEGGREST(v.1.42.0), shiny(v.1.7.5.1), SummarizedExperiment(v.1.32.0), interactiveDisplayBase(v.1.40.0), highr(v.0.10), AnnotationHub(v.3.10.0), memoise(v.2.0.1), bslib(v.0.5.1), ggtree(v.3.10.0), fastmatch(v.1.1-4), bit(v.4.0.5), gson(v.0.1.0) and ape(v.5.7-1)